library(DESeq2)
library(data.table)
library(gdata)
library(rtracklayer)
library(BSgenome)
library(VennDiagram)
library(ggplot2)
library(biomaRt)
library(RColorBrewer)
library(readr)
library(ensembldb)
library(EnsDb.Mmusculus.v79)
library(AnnotationDbi)
library(tidyverse)
library(plotly)
dir.create("PDF_figure/T6B_derepression_CDF_merged", showWarnings = FALSE)
peaks.mir <- readRDS("Datafiles/merged-peaks-mirs-200-12232019-withID.rds")
mirs.peaks <- readRDS("Datafiles/miRNA-merged-peaks-list-12232019-withIDs.rds")
T6B_DGE <- read_csv("~/OneDrive - Harvard University/Haigis Lab/Projects/Halo-Ago2/Halo-Ago-KRas/Raw Data/RNA-Seq/T6B mouse colon/Result/Differential Analysis.csv")
## New names:
## * `` -> ...1
## Rows: 12563 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): ...1
## dbl (6): baseMean, log2FoldChange, lfcSE, stat, pvalue, padj
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
colnames(T6B_DGE)[1] <- "gene"
colnames(T6B_DGE)[3] <- "log2FC"
plotCDF.ggplot <- function(gene.counts, gene.sets, gene.set.labels,
col = "", linetype = "", xlim = c( -1.0, 1.3 ),
legend.size = 22, axistitle.size = 22, title = "Fold change log2 (Dicer KO/WT)",
legend.pos = c(0.7, 0.18)) {
require(ggplot2)
df.log2fc <- gene.counts[,c("gene", "log2FC")]
#rownames(df.log2fc) <- df.log2fc$gene
if (length(gene.sets) != length(gene.set.labels)){
return("Length of gene sets doesn't match labels")
}
target.expr <- df.log2fc[df.log2fc$gene %in% gene.sets[[1]],]
for (i in 2:length(gene.sets)){
target.expr <- rbind(target.expr, df.log2fc[df.log2fc$gene %in% gene.sets[[i]],])
}
gene.set.counts <- c()
for (j in 1:length(gene.sets)){
gene.set.counts <- c(gene.set.counts, sum(df.log2fc$gene %in% gene.sets[[j]]))
}
target.expr$Category <- rep(gene.set.labels, gene.set.counts)
target.expr$Category <- factor(target.expr$Category, levels = gene.set.labels)
log2FC.values <- lapply(gene.sets, function(gene.set) {
gene.counts[gene.counts$gene %in% gene.set,]$log2FC
})
ks.pvals <- lapply(log2FC.values,
function(log2FCs) {
ks.test(log2FCs, log2FC.values[[1]])$p.value
})
p <- ggplot( target.expr, aes( x = log2FC, colour = Category ) ) +
stat_ecdf( geom = 'step', aes( colour = Category, linetype = Category ), lwd = 1 ) +
scale_color_manual( values = col, labels = sprintf( "%s (%d)", gene.set.labels, gene.set.counts ) ) +
scale_linetype_manual( values = linetype, labels = sprintf( "%s (%d)", gene.set.labels, gene.set.counts ) ) +
# xlim() will remove data points; Be careful in the future
coord_cartesian( xlim = xlim ) + xlab(title) + ylab('CDF') +
theme_classic() + theme( legend.background = element_rect(fill = NA),
legend.title = element_blank(),
legend.position = legend.pos,
legend.text = element_text(size=legend.size),
legend.key.size = unit(1.5, 'lines'),
axis.title.x = element_text(size=axistitle.size, margin = margin(t = 10)),
axis.title.y = element_text(size=axistitle.size, margin = margin(r = 10)),
axis.text=element_text(size=20),
axis.line = element_line(size = 1), #axis label size
axis.ticks.length = unit(0.3, "cm")) #increase tick length
for (k in 2:length(gene.sets)){
p <- p + annotate(geom = "text", x = -0.9, y = 1-0.08*(k-1), hjust = 0,
label = sprintf("p = %.0e", ks.pvals[k]),
colour = col[k], size = 8)
}
print(p)
}
cols <- c(brewer.pal(name = "Set2", n = 8), brewer.pal(name = "Set3", n = 3))
plotCDF.ggplot(T6B_DGE,
list(T6B_DGE$gene, peaks.mir$target_Ensembl_ID),
c("All genes", "miRNAs over 200"),
col = c("grey15", cols[1]),
linetype = c(1, 1),
title = "RNA_LFC(T6B/control)",
legend.size = 15
)
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
pdf("PDF_figure/T6B_derepression_CDF_merged/T6B_CDF_miRNAover200.pdf",
height = 8,
width = 10)
plotCDF.ggplot(T6B_DGE,
list(T6B_DGE$gene, peaks.mir$target_Ensembl_ID),
c("All genes", "miRNAs over 200"),
col = c("grey15", cols[1]),
linetype = c(1, 1),
title = "RNA_LFC(T6B/control)",
legend.size = 15
)
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
dev.off()
## quartz_off_screen
## 2
len <- sapply(mirs.peaks, function(x) length(x))
mirs.peaks <- mirs.peaks[order(-len)]
mirna <- names(mirs.peaks)
for (i in 1:10) {
plotCDF.ggplot(T6B_DGE,
list(T6B_DGE$gene, mirs.peaks[[i]]$target_Ensembl_ID),
c("All genes", mirna[i]),
col = c("grey15", cols[i]),
linetype = c(1, 1),
title = "RNA_LFC(T6B/control)",
legend.size = 15
)
}
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
pdf("PDF_figure/T6B_derepression_CDF_merged/T6B_CDF_top10miRNA.pdf",
height = 8,
width = 10)
for (i in 1:10) {
plotCDF.ggplot(T6B_DGE,
list(T6B_DGE$gene, mirs.peaks[[i]]$target_Ensembl_ID),
c("All genes", mirna[i]),
col = c("grey15", cols[i]),
linetype = c(1, 1),
title = "RNA_LFC(T6B/control)",
legend.size = 15
)
}
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
dev.off()
## quartz_off_screen
## 2
zscore.cal <- function(genes = NA, index = NA, expression.dataset){
if (sum(!is.na(genes)) > 0 && is.na(index)){
gene.set <- expression.dataset[expression.dataset$gene %in% genes,]
} else if (!is.na(index)) {
gene.set <- expression.dataset[index,]
} else {
print("Please input gene names or row index.")
}
mu <- mean(expression.dataset$log2FC)
Sm <- mean(gene.set$log2FC)
m <- length(genes)
SD <- sd(expression.dataset$log2FC)
z <- (Sm-mu)*sqrt(m)/SD
return(z)
}
zscores.all <- as.data.frame(sapply(mirs.peaks,
function(targets){
zscore.cal(genes = targets$target_Ensembl_ID, expression.dataset = T6B_DGE)
}))
colnames(zscores.all) <- c("z.score")
lens <- as.data.frame(sapply(mirs.peaks, function(x) length(x)))
colnames(lens) <- "N"
zscores.all <- cbind(zscores.all, lens$N)
mirna.family.DGE <- readRDS("Datafiles/mirna-counts-deseq-by-family-12232019.rds")
zscores.all <- as_tibble(zscores.all, rownames = "miR.family")
zscores.all <- inner_join(zscores.all, mirna.family.DGE[,c(1,7)], by = c("miR.family" = "miR.family"))
colnames(zscores.all)[3] <- "N"
p1 <- ggplot(zscores.all, aes(x = z.score, y = log2(baseMean), size = N, label = miR.family)) +
geom_point(colour = "#15CD40", alpha = 0.6) +
xlab("T6B RNA-Seq Z-score") +
ylab("miRNA family abundance") +
theme_bw() +
theme(panel.border = element_rect(),
panel.background = element_blank(),
panel.grid.major = element_line(),
panel.grid.minor = element_line(),
axis.title.x = element_text(size=14, margin = margin(t = 10)),
axis.title.y = element_text(size=14, margin = margin(r = 10)),
axis.text = element_text(size=10),
axis.line.y = element_blank(),
axis.line.x = element_blank(),
axis.ticks.x = element_line(size = 0.5),
axis.ticks.y = element_line(size = 0.5),
plot.title = element_text(hjust = 0.5, size = 14, face = "bold"))
cor.test(zscores.all$z.score, log2(zscores.all$baseMean))
##
## Pearson's product-moment correlation
##
## data: zscores.all$z.score and log2(zscores.all$baseMean)
## t = 5.1337, df = 73, p-value = 2.27e-06
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.3262018 0.6643385
## sample estimates:
## cor
## 0.515033
pdf("PDF_figure/T6B_derepression_CDF_merged/T6B_CDF_zscore.pdf",
height = 4,
width = 6)
p1
dev.off()
## quartz_off_screen
## 2
ggplotly(p1)
sessionInfo()
## R version 4.1.1 (2021-08-10)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Catalina 10.15.4
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] grid parallel stats4 stats graphics grDevices utils
## [8] datasets methods base
##
## other attached packages:
## [1] plotly_4.10.0 forcats_0.5.1
## [3] stringr_1.4.0 dplyr_1.0.7
## [5] purrr_0.3.4 tidyr_1.1.4
## [7] tibble_3.1.6 tidyverse_1.3.1
## [9] EnsDb.Mmusculus.v79_2.99.0 ensembldb_2.16.4
## [11] AnnotationFilter_1.16.0 GenomicFeatures_1.44.2
## [13] AnnotationDbi_1.54.1 readr_2.1.0
## [15] RColorBrewer_1.1-2 biomaRt_2.48.3
## [17] ggplot2_3.3.5 VennDiagram_1.7.0
## [19] futile.logger_1.4.3 BSgenome_1.60.0
## [21] Biostrings_2.60.2 XVector_0.32.0
## [23] rtracklayer_1.52.1 gdata_2.18.0
## [25] data.table_1.14.2 DESeq2_1.32.0
## [27] SummarizedExperiment_1.22.0 Biobase_2.52.0
## [29] MatrixGenerics_1.4.3 matrixStats_0.61.0
## [31] GenomicRanges_1.44.0 GenomeInfoDb_1.28.4
## [33] IRanges_2.26.0 S4Vectors_0.30.2
## [35] BiocGenerics_0.38.0
##
## loaded via a namespace (and not attached):
## [1] colorspace_2.0-2 rjson_0.2.20 ellipsis_0.3.2
## [4] fs_1.5.0 rstudioapi_0.13 farver_2.1.0
## [7] bit64_4.0.5 lubridate_1.8.0 fansi_0.5.0
## [10] xml2_1.3.2 splines_4.1.1 cachem_1.0.6
## [13] geneplotter_1.70.0 knitr_1.36 jsonlite_1.7.2
## [16] Rsamtools_2.8.0 broom_0.7.10 annotate_1.70.0
## [19] dbplyr_2.1.1 png_0.1-7 compiler_4.1.1
## [22] httr_1.4.2 backports_1.4.0 assertthat_0.2.1
## [25] Matrix_1.3-4 fastmap_1.1.0 lazyeval_0.2.2
## [28] cli_3.1.0 formatR_1.11 htmltools_0.5.2
## [31] prettyunits_1.1.1 tools_4.1.1 gtable_0.3.0
## [34] glue_1.5.0 GenomeInfoDbData_1.2.6 rappdirs_0.3.3
## [37] Rcpp_1.0.7 cellranger_1.1.0 jquerylib_0.1.4
## [40] vctrs_0.3.8 crosstalk_1.2.0 xfun_0.28
## [43] rvest_1.0.2 lifecycle_1.0.1 restfulr_0.0.13
## [46] gtools_3.9.2 XML_3.99-0.8 zlibbioc_1.38.0
## [49] scales_1.1.1 vroom_1.5.6 hms_1.1.1
## [52] ProtGenerics_1.24.0 lambda.r_1.2.4 yaml_2.2.1
## [55] curl_4.3.2 memoise_2.0.1 stringi_1.7.5
## [58] RSQLite_2.2.8 highr_0.9 genefilter_1.74.1
## [61] BiocIO_1.2.0 filelock_1.0.2 BiocParallel_1.26.2
## [64] rlang_0.4.12 pkgconfig_2.0.3 bitops_1.0-7
## [67] evaluate_0.14 lattice_0.20-45 labeling_0.4.2
## [70] htmlwidgets_1.5.4 GenomicAlignments_1.28.0 bit_4.0.4
## [73] tidyselect_1.1.1 magrittr_2.0.1 R6_2.5.1
## [76] generics_0.1.1 DelayedArray_0.18.0 DBI_1.1.1
## [79] haven_2.4.3 pillar_1.6.4 withr_2.4.2
## [82] survival_3.2-13 KEGGREST_1.32.0 RCurl_1.98-1.5
## [85] modelr_0.1.8 crayon_1.4.2 futile.options_1.0.1
## [88] utf8_1.2.2 BiocFileCache_2.0.0 tzdb_0.2.0
## [91] rmarkdown_2.11 progress_1.2.2 readxl_1.3.1
## [94] locfit_1.5-9.4 blob_1.2.2 reprex_2.0.1
## [97] digest_0.6.28 xtable_1.8-4 munsell_0.5.0
## [100] viridisLite_0.4.0